# Importing essential libraries for data manipulation and visualization.
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
# Load the Netflix catalogue from 'netflix_titles.csv' (path is relative to the
# current working directory) into a DataFrame used by every later cell.
netflix_titles=pd.read_csv('netflix_titles.csv')
# Preview the first five rows to check the columns and typical values.
netflix_titles.head()
| show_id | type | title | director | cast | country | date_added | release_year | rating | duration | listed_in | description | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | s1 | Movie | Dick Johnson Is Dead | Kirsten Johnson | NaN | United States | September 25, 2021 | 2020 | PG-13 | 90 min | Documentaries | As her father nears the end of his life, filmm... |
| 1 | s2 | TV Show | Blood & Water | NaN | Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban... | South Africa | September 24, 2021 | 2021 | TV-MA | 2 Seasons | International TV Shows, TV Dramas, TV Mysteries | After crossing paths at a party, a Cape Town t... |
| 2 | s3 | TV Show | Ganglands | Julien Leclercq | Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi... | NaN | September 24, 2021 | 2021 | TV-MA | 1 Season | Crime TV Shows, International TV Shows, TV Act... | To protect his family from a powerful drug lor... |
| 3 | s4 | TV Show | Jailbirds New Orleans | NaN | NaN | NaN | September 24, 2021 | 2021 | TV-MA | 1 Season | Docuseries, Reality TV | Feuds, flirtations and toilet talk go down amo... |
| 4 | s5 | TV Show | Kota Factory | NaN | Mayur More, Jitendra Kumar, Ranjan Raj, Alam K... | India | September 24, 2021 | 2021 | TV-MA | 2 Seasons | International TV Shows, Romantic TV Shows, TV ... | In a city of coaching centers known to train I... |
# Report the dataset's dimensions as a (rows, columns) tuple.
netflix_titles.shape
(8807, 12)
# List the column labels to see which fields are available for analysis.
netflix_titles.columns
Index(['show_id', 'type', 'title', 'director', 'cast', 'country', 'date_added',
'release_year', 'rating', 'duration', 'listed_in', 'description'],
dtype='object')
# Count the missing values in each column to decide which ones need filling.
netflix_titles.isnull().sum()
show_id 0 type 0 title 0 director 2634 cast 825 country 831 date_added 10 release_year 0 rating 4 duration 3 listed_in 0 description 0 dtype: int64
# Count rows that exactly repeat an earlier row across all columns.
netflix_titles.duplicated().sum()
0
# Fill missing values: free-text columns get an 'Unknown' placeholder, while
# 'country', 'date_added' and 'rating' get their most frequent value (mode).
#
# Each result is assigned back to its column instead of calling
# fillna(inplace=True) on a column selection. That chained in-place form emits
# a FutureWarning in pandas 2.x and leaves the DataFrame unchanged once
# Copy-on-Write is in effect.
for placeholder_column in ('director', 'cast', 'duration'):
    netflix_titles[placeholder_column] = netflix_titles[placeholder_column].fillna('Unknown')

country_mode = netflix_titles['country'].mode()[0]
netflix_titles['country'] = netflix_titles['country'].fillna(country_mode)

date_added_mode = netflix_titles['date_added'].mode()[0]
netflix_titles['date_added'] = netflix_titles['date_added'].fillna(date_added_mode)

rating_mode = netflix_titles['rating'].mode()[0]
netflix_titles['rating'] = netflix_titles['rating'].fillna(rating_mode)
# Re-count missing values per column to confirm the fills left none behind.
netflix_titles.isnull().sum()
show_id 0 type 0 title 0 director 0 cast 0 country 0 date_added 0 release_year 0 rating 0 duration 0 listed_in 0 description 0 dtype: int64
# Re-check the dimensions; filling values should not add or drop rows/columns.
netflix_titles.shape
(8807, 12)
# Preview the first five rows again to see the filled-in values.
netflix_titles.head()
| show_id | type | title | director | cast | country | date_added | release_year | rating | duration | listed_in | description | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | s1 | Movie | Dick Johnson Is Dead | Kirsten Johnson | Unknown | United States | September 25, 2021 | 2020 | PG-13 | 90 min | Documentaries | As her father nears the end of his life, filmm... |
| 1 | s2 | TV Show | Blood & Water | Unknown | Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban... | South Africa | September 24, 2021 | 2021 | TV-MA | 2 Seasons | International TV Shows, TV Dramas, TV Mysteries | After crossing paths at a party, a Cape Town t... |
| 2 | s3 | TV Show | Ganglands | Julien Leclercq | Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi... | United States | September 24, 2021 | 2021 | TV-MA | 1 Season | Crime TV Shows, International TV Shows, TV Act... | To protect his family from a powerful drug lor... |
| 3 | s4 | TV Show | Jailbirds New Orleans | Unknown | Unknown | United States | September 24, 2021 | 2021 | TV-MA | 1 Season | Docuseries, Reality TV | Feuds, flirtations and toilet talk go down amo... |
| 4 | s5 | TV Show | Kota Factory | Unknown | Mayur More, Jitendra Kumar, Ranjan Raj, Alam K... | India | September 24, 2021 | 2021 | TV-MA | 2 Seasons | International TV Shows, Romantic TV Shows, TV ... | In a city of coaching centers known to train I... |
# Print each column's dtype and non-null count, plus the frame's memory usage.
netflix_titles.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 8807 entries, 0 to 8806 Data columns (total 12 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 show_id 8807 non-null object 1 type 8807 non-null object 2 title 8807 non-null object 3 director 8807 non-null object 4 cast 8807 non-null object 5 country 8807 non-null object 6 date_added 8807 non-null object 7 release_year 8807 non-null int64 8 rating 8807 non-null object 9 duration 8807 non-null object 10 listed_in 8807 non-null object 11 description 8807 non-null object dtypes: int64(1), object(11) memory usage: 825.8+ KB
# Donut chart of how the catalogue splits between Movies and TV Shows.
colors = px.colors.qualitative.Set1
fig = px.pie(
    netflix_titles,
    names='type',
    title='Netflix Shows by Movie & TV Shows',
    hole=0.4,
    color_discrete_sequence=colors,
    labels={'type': 'Show Type'},
)
# Horizontal legend anchored above the plot area, and a caption placed in the
# centre of the donut hole.
fig.update_layout(
    showlegend=True,
    legend={'orientation': 'h', 'yanchor': 'bottom', 'y': 1.02, 'xanchor': 'right', 'x': 1},
    annotations=[
        {'text': 'Total Shows', 'x': 0.5, 'y': 0.5, 'font_size': 20, 'showarrow': False},
    ],
)
# Count titles for every (country, type) pair, largest counts first.
netflix_titles_country = (
    netflix_titles[['country', 'type']]
    .groupby(by=['country', 'type'])
    .size()
    .reset_index(name='count')
    .sort_values(by='count', ascending=False)
)
# Select the 10 countries with the most titles overall, then keep all of their
# (country, type) rows. Taking head(10) of the pair table instead returns the
# 10 largest pairs, which covers fewer than 10 countries whenever a country
# appears with both types.
# NOTE(review): 'country' is used as stored; if an entry lists several
# comma-separated countries, that combination counts as its own category.
top_countries = netflix_titles['country'].value_counts().head(10).index.tolist()
top_country_counts = netflix_titles_country[netflix_titles_country['country'].isin(top_countries)]
# Grouped bar chart of the top 10 countries, one bar per content type, with
# countries ordered by their overall title count.
colors = px.colors.qualitative.Set1
fig = px.bar(
    top_country_counts,
    x='country',
    y='count',
    color='type',
    title='Top 10 Netflix Shows/Movies Count by Country and Type',
    labels={'count': 'Show/Movie Count', 'country': 'Country'},
    color_discrete_sequence=colors,
    category_orders={'country': top_countries},
)
fig.update_layout(
    xaxis_title='Country',
    yaxis_title='Show/Movie Count',
    legend_title='Show Type',
    legend=dict(orientation='h', yanchor='bottom', y=1.02, xanchor='right', x=1),
    barmode='group',
    margin=dict(l=50, r=50, t=70, b=50),
)
# Count how many titles each 'director' value is credited with.
netflix_titles_directors = netflix_titles.groupby('director').size().reset_index(name='title_count')
# Rank by title count before taking the top 5: groupby returns rows ordered by
# the director label, so head(5) on the unsorted table is not a "top 5".
# 'Unknown' is the placeholder used for missing directors earlier in this file
# and is excluded so it cannot be reported as a director.
top_directors = (
    netflix_titles_directors[netflix_titles_directors['director'] != 'Unknown']
    .sort_values('title_count', ascending=False)
    .head(5)
)
# Treemap of the five most-credited directors; tile size and colour both encode
# the title count.
px.treemap(
    top_directors,
    path=['director'],
    values='title_count',
    title='Top 5 Directors with Most Titles on Netflix',
    color='title_count',
    color_continuous_scale='viridis',
    hover_name='director',
    labels={'title_count': 'Title Count'},
    height=400,
    width=800,
)
# Convert 'date_added' to datetime and derive 'year_added' / 'month_added'.
# The conversion is skipped when the column is already datetime so the cell can
# be re-run. Surrounding whitespace is stripped first so that a stray leading
# or trailing space in a date string cannot break date-format inference.
if not pd.api.types.is_datetime64_any_dtype(netflix_titles['date_added']):
    netflix_titles['date_added'] = pd.to_datetime(netflix_titles['date_added'].str.strip())
netflix_titles['year_added'] = netflix_titles['date_added'].dt.year
netflix_titles['month_added'] = netflix_titles['date_added'].dt.month
# Fill any missing month/year with the respective mode. The result is assigned
# back to the column because fillna(inplace=True) on a column selection is
# chained assignment, which does not update the DataFrame under Copy-on-Write.
month_mode = netflix_titles['month_added'].mode()[0]
netflix_titles['month_added'] = netflix_titles['month_added'].fillna(month_mode)
year_mode = netflix_titles['year_added'].mode()[0]
netflix_titles['year_added'] = netflix_titles['year_added'].fillna(year_mode)
# Count missing values per column again, now including the derived
# 'year_added' and 'month_added' columns.
netflix_titles.isnull().sum()
show_id 0 type 0 title 0 director 0 cast 0 country 0 date_added 0 release_year 0 rating 0 duration 0 listed_in 0 description 0 year_added 0 month_added 0 dtype: int64
# Tally how many titles were added in each year, in chronological order.
year_counts = netflix_titles['year_added'].value_counts()
titles_per_year = year_counts.sort_index()
# Use plain integer year labels on the index.
titles_per_year.index = titles_per_year.index.astype(int)
# Line chart of additions per year with point markers and a dashed horizontal grid.
plt.figure(figsize=(12, 6))
line_style = {'color': 'skyblue', 'marker': 'o', 'linewidth': 2, 'markersize': 8}
titles_per_year.plot(kind='line', **line_style)
plt.title('Number of Titles Added to Netflix Over Time')
plt.xlabel('Year')
plt.ylabel('Number of Titles Added')
plt.xticks(rotation=45)
plt.grid(axis='y', linestyle='--', alpha=0.7)
# Density plots comparing when Movies and TV Shows were added to Netflix:
# first by month of the year, then by calendar year. Each content type gets its
# own independently normalised curve.
plt.figure(figsize=(7, 2))
for content_type in ('Movie', 'TV Show'):
    titles_of_type = netflix_titles[netflix_titles['type'] == content_type]
    sns.kdeplot(data=titles_of_type, x='month_added', common_norm=False, fill=True, label=content_type)
plt.title("Distribution of movie and TV show releases over the months")
plt.xlabel("Month added to Netflix")
plt.ylabel("Density")
plt.legend(loc='upper left', bbox_to_anchor=(1, 1))

plt.figure(figsize=(7, 2))
for content_type in ('Movie', 'TV Show'):
    titles_of_type = netflix_titles[netflix_titles['type'] == content_type]
    sns.kdeplot(data=titles_of_type, x='year_added', common_norm=False, fill=True, label=content_type)
plt.title("Distribution of movie and TV show releases over the years")
plt.xlabel("Year added to Netflix")
plt.ylabel("Density")
# Legend sits outside the axes, to the right of the plot.
plt.legend(loc='upper left', bbox_to_anchor=(1, 1))
<matplotlib.legend.Legend at 0x2ca3cf3dc10>
# Flag rows whose 'rating' text contains 'min' (a duration rather than a
# rating); missing ratings count as no match. Then report how many there are.
rating_has_duration = netflix_titles['rating'].str.contains('min', na=False)
time_duration_count = rating_has_duration.sum()
print(time_duration_count)
3
# Overwrite ratings that contain 'min' (durations stored in the wrong column)
# with the 'Unknown' placeholder; all other ratings are left as they are.
rating_is_duration = netflix_titles['rating'].str.contains('min', na=False)
netflix_titles['rating'] = netflix_titles['rating'].mask(rating_is_duration, 'Unknown')
# Count plot of titles per rating, bars ordered from most to least frequent.
# The rating frequencies are computed once and reused for the palette size and
# the bar order.
rating_counts = netflix_titles['rating'].value_counts()
custom_palette = sns.color_palette("viridis", n_colors=len(rating_counts))
plt.figure(figsize=(12, 6))
sns.countplot(x='rating', data=netflix_titles, order=rating_counts.index, palette=custom_palette)
plt.xticks(rotation=45, ha="right")
plt.title('Rating distribution of shows and movies on Netflix', fontsize=16)
plt.xlabel('Rating', fontsize=14)
plt.ylabel('Count', fontsize=14)
plt.grid(axis='y', linestyle='--', alpha=0.7)
# Label each bar with its count, centred 10 points above the bar top. The
# height is formatted as a whole number so a float height is not printed with a
# trailing '.0'.
rating_axes = plt.gca()
for p in rating_axes.patches:
    rating_axes.annotate(
        f'{p.get_height():.0f}',
        (p.get_x() + p.get_width() / 2., p.get_height()),
        ha='center', va='center', xytext=(0, 10), textcoords='offset points',
        fontsize=10, color='black',
    )
rating_axes.set_facecolor('#f5f5f5')
# Count plot of ratings with one bar per content type, to show how ratings
# differ between Movies and TV Shows.
plt.figure(figsize=(12, 8))
bar_outline = {'edgecolor': 'black', 'linewidth': 1.2}
sns.countplot(x='rating', hue='type', data=netflix_titles, palette='pastel', **bar_outline)
plt.title('Relation Between Type and Rating', fontsize=16)
plt.xlabel('Rating', fontsize=14)
plt.ylabel('Count', fontsize=14)
plt.legend(title='Type', title_fontsize='14', loc='upper right')
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.gca().set_facecolor('#ecf0f1')
# Draw all four axes borders in black.
for side in ('top', 'right', 'bottom', 'left'):
    plt.gca().spines[side].set_color('black')
# Scatter plot of rating against the year a title was added, coloured by
# content type, to show which ratings appear in which years.
plt.figure(figsize=(12, 7))
marker_style = {'palette': 'Set1', 'alpha': 0.7, 's': 100}
sns.scatterplot(x='year_added', y='rating', hue='type', data=netflix_titles, **marker_style)
plt.title('Rating Trends Over the Years', fontsize=18)
plt.xlabel('Year Added', fontsize=14)
plt.ylabel('Rating', fontsize=14)
# Legend sits outside the axes, to the right of the plot.
plt.legend(title='Type', title_fontsize=14, loc='upper left', bbox_to_anchor=(1, 1))
plt.grid(axis='both', linestyle='--', alpha=0.5)
scatter_axes = plt.gca()
scatter_axes.set_facecolor('#f9f9f9')
plt.tick_params(axis='both', which='both', direction='in', length=6, width=1, colors='black')
# Text objects of the legend entries, kept for any later restyling.
legend_labels = scatter_axes.get_legend().get_texts()
# Build one row per (title, genre) pair, keeping each genre next to the
# release year of the title it came from. The comma-separated 'listed_in'
# string is split into a list and exploded together with 'release_year'.
# Exploding the genres alone and resetting their index before pairing them with
# 'release_year' would align by position, attaching genres to the wrong years
# and leaving the surplus rows with no year at all.
netflix_titles_genere = (
    pd.DataFrame({
        'release_year': netflix_titles['release_year'],
        'genere': netflix_titles['listed_in'].str.split(', '),
    })
    .explode('genere')
    .reset_index(drop=True)
)
# Flat series of individual genres, one entry per (title, genre) pair.
netflix_genres = netflix_titles_genere['genere']
# Count how often each genre occurs in each release year.
netflix_titles_genere = netflix_titles_genere.groupby(['release_year', 'genere']).size().reset_index(name='count')
# Keep the top 5 genres of every release year, including ties with the
# fifth-largest count. A descending 'min' rank of at most 5 selects exactly
# those rows and avoids groupby.apply, which no longer passes the grouping
# column to the applied function in newer pandas releases.
genre_rank = netflix_titles_genere.groupby('release_year')['count'].rank(method='min', ascending=False)
top_5 = (
    netflix_titles_genere[genre_rank <= 5]
    .sort_values(['release_year', 'count'], ascending=[True, False])
    .reset_index(drop=True)
)
# Sunburst chart: inner ring is the release year, outer ring its top genres;
# both segment size and colour encode the genre count.
fig = px.sunburst(
    top_5,
    title='Top 5 Genres Each Year',
    path=['release_year', 'genere'],
    values='count',
    color='count',
    color_continuous_scale='RdBu',
    hover_name='genere',
)
fig.update_layout(
    width=600,
    height=500,
    margin=dict(l=0, r=0, t=40, b=20),
    font=dict(family='Arial', size=12, color='black'),
)
fig.update_coloraxes(colorbar=dict(title='Count', tickfont=dict(size=10)))
# Show each segment's label and its share of the parent ring.
fig.update_traces(
    textinfo='label+percent parent',
    hoverinfo='all',
)
# Build a 0/1 indicator matrix with one column per genre found in 'listed_in'.
genre_matrix = netflix_titles['listed_in'].str.get_dummies(', ').astype(int)
# Attach the content type so genre counts can be split by Movie / TV Show.
genre_matrix['type'] = netflix_titles['type']
# Sum the indicators per content type to get genre counts for each type.
genre_counts_by_type = genre_matrix.groupby('type').sum()
# Names of the 5 genres with the highest combined count across both types.
top_5_genres = genre_counts_by_type.sum().sort_values(ascending=False).head(5).index.tolist()
# Restrict the per-type counts to those 5 genres.
genre_counts_top_5 = genre_counts_by_type[top_5_genres]
# Horizontal stacked bar chart: one bar per content type, one segment per genre.
# The style is set before the figure is created, and the chart is drawn on that
# figure's axes via ax=. Without ax=, DataFrame.plot opens its own
# default-sized figure and the 12x8 one is left empty.
sns.set_style("white")
plt.figure(figsize=(12, 8))
colors = sns.color_palette("pastel", n_colors=len(top_5_genres))
ax = genre_counts_top_5.plot(kind='barh', stacked=True, color=colors, edgecolor='k', linewidth=0.5, ax=plt.gca())
plt.title('Top 5 Genres in Netflix Titles: Movies vs. TV Shows', fontsize=18, color='navy')
plt.xlabel('Count of Titles', fontsize=14, labelpad=10)
plt.ylabel('Titles', fontsize=14, labelpad=10)
# Legend sits outside the axes, to the right of the plot.
plt.legend(title='Genres', fontsize=10, title_fontsize='11', loc='upper left', bbox_to_anchor=(1, 1))
# Flip the y-axis so the first content type is drawn at the top.
plt.gca().invert_yaxis()
plt.xticks(fontsize=10)
plt.yticks(fontsize=10)
plt.grid(True, linestyle='--', which='major', color='grey', alpha=0.7)
<Figure size 1200x800 with 0 Axes>
# Import the WordCloud library for generating word cloud visualizations.
from wordcloud import WordCloud
# Count how many titles mention each country. Entries may list several
# countries separated by commas, so each entry is split, whitespace is trimmed
# and empty fragments are dropped before counting.
country_names = netflix_titles['country'].dropna().str.split(',').explode().str.strip()
country_frequencies = country_names[country_names != ''].value_counts().to_dict()
# Space-joined country text, kept under its original name for reference.
text_data = ' '.join(netflix_titles['country'].dropna().str.replace(',', ''))
# Build the word cloud from per-country frequencies rather than from the joined
# text: tokenising the text would break multi-word names such as
# 'United States' into separate words and size them by word, not by country.
wordcloud = WordCloud(
    width=800,
    height=400,
    background_color='black',
    colormap='viridis',
    max_words=100,
    contour_width=2,
    contour_color='white'
).generate_from_frequencies(country_frequencies)
# Render the cloud as an image with the axes hidden.
plt.figure(figsize=(12, 6))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.title('Word Cloud for Countries in Netflix Titles', fontsize=16, color='white')
plt.show()